#Load packages
library(data.table)
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
#Add data GPS
# Einlesen des Datensets
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")
# Anzeige der ersten Zeilen des Datensets
head(country_data)
# Determine the number of different countries
number_of_countries <- length(unique(gps_data$country))
# Display the number of different countries
number_of_countries
[1] 76
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, isocode, risktaking, gender, age)
# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))
# Display the cleaned data
cleaned_data
# Display the number of records removed per variable
records_removed_per_variable
country isocode ison region language date
0 0 0 0 0 0
id_gallup wgt patience risktaking posrecip negrecip
0 0 190 634 32 247
altruism trust subj_math_skills gender age
74 163 132 0 276
# List all variables
variable_list <- names(cleaned_data) # OR colnames(your_data)
# Display the list of variables
print(variable_list)
[1] "country" "isocode" "ison" "region" "language"
[6] "date" "id_gallup" "wgt" "patience" "risktaking"
[11] "posrecip" "negrecip" "altruism" "trust" "subj_math_skills"
[16] "gender" "age" "age_group" "agecat"
# Calculate the variance of the "patience" variable
variance_patience <- var(cleaned_data$patience, na.rm = TRUE)
variance_risktaking <- var(cleaned_data$risktaking, na.rm = TRUE)
variance_age <- var(cleaned_data$age, na.rm = TRUE)
# Display the variance of the "patience" variable
variance_patience
[1] 1.000596
variance_risktaking
[1] 1.000054
variance_age
[1] 304.2504
hist(cleaned_data$age, main = "Histogram of Age", xlab = "age", ylab = "Frequency")
hist(cleaned_data$patience, main = "Histogram of patience", xlab = "patience", ylab = "Frequency")
hist(cleaned_data$risktaking, main = "Histogram of risktaking", xlab = "risktaking", ylab = "Frequency")
boxplot(cleaned_data$age, main = "Boxplot of Age")
boxplot(cleaned_data$patience, main = "Boxplot of Patience")
boxplot(cleaned_data$risktaking, main = "Boxplot of Risktaking")
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)
# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)
# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)
# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
Age Range: 15 to 99
cat("Average Age: ", average_age, "\n")
Average Age: 41.73605
cat("Median Age: ", median_age, "\n")
Median Age: 40
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)
# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
Number of Females: 36024
cat("Number of Males: ", gender_counts[2], "\n")
Number of Males: 43415
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)
# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)
# Display the table
gender_by_country
gender
country Female Male
Turkey 499 498
France 537 485
Netherlands 383 604
Spain 434 554
Italy 382 609
Poland 480 513
Hungary 408 582
Czech Republic 416 575
Romania 406 594
Sweden 402 596
Greece 327 671
China 485 515
Venezuela 480 497
Kenya 411 592
Tanzania 1188 1338
Israel 366 630
Ghana 448 551
Uganda 385 559
Malawi 413 578
Australia 517 502
Sri Lanka 350 645
Cambodia 456 538
Botswana 415 572
Rwanda 447 539
Afghanistan 466 521
Georgia 510 490
Kazakhstan 438 561
Moldova 416 578
Ukraine 253 250
Cameroon 402 601
Zimbabwe 1379 1134
Costa Rica 456 539
Argentina 1301 1162
Austria 496 469
Bolivia 476 503
Bosnia Herzegovina 409 575
Chile 471 525
Colombia 499 493
Croatia 366 596
Estonia 516 482
Finland 427 560
Guatemala 412 588
Haiti 528 470
Lithuania 424 566
Nicaragua 459 532
Peru 487 497
Portugal 450 545
Serbia 559 436
Suriname 496 498
Switzerland 455 533
United States 425 575
Morocco 393 594
Saudi Arabia 383 592
Jordan 409 541
Pakistan 489 958
Indonesia 486 514
Bangladesh 519 516
United Kingdom 473 538
Germany 448 550
Iran 459 521
Japan 444 554
India 419 573
Brazil 240 250
Mexico 468 527
Nigeria 396 597
South Africa 544 455
Canada 360 639
Philippines 476 511
Vietnam 546 454
Thailand 316 666
South Korea 440 552
Russia 470 514
Algeria 516 529
Iraq 388 608
United Arab Emirates 427 550
Egypt 504 496
# Group the data by country
table_data <- gps_data %>%
group_by(country, isocode) %>%
summarize(
n = n(),
female_percentage = mean(gender == 1) * 100,
mean_age = mean(age, na.rm = TRUE),
age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
mean_risktaking = mean(risktaking, na.rm = TRUE)
)
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
# Display the table
table_data
cleaned_data$agecat <- cut(
cleaned_data$age,
breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
right = FALSE # Left end (inclusive), right end (exclusive)
)
# Display the new column
head(cleaned_data)
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)
# Display the number of participants in the age categories
print(agecat_counts)
15-19 20-29 30-39 40-49 50-59 60-69 70-79 80+
6888 16872 15905 13583 11374 8570 4688 1559
# Calculate the correlation matrix with NA removal
correlation_matrix <- cor(cleaned_data[, c("patience", "risktaking", "altruism", "trust")], use = "pairwise.complete.obs")
# Print the correlation matrix
print(correlation_matrix)
patience risktaking altruism trust
patience 1.00000000 0.20573586 0.08397351 0.06404146
risktaking 0.20573586 1.00000000 0.09524941 0.03517175
altruism 0.08397351 0.09524941 1.00000000 0.16734183
trust 0.06404146 0.03517175 0.16734183 1.00000000
#Preparing for linear regression
# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)
# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
Missing values in 'Country': FALSE
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")
Missing values in 'Risktaking': FALSE
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, risktaking, age)
# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
group_by(country) %>%
do(model = lm(risktaking ~ age, data = .)) %>%
summarize(
country = first(country),
intercept = summary(model)$coefficients[1],
slope = summary(model)$coefficients[2],
r_squared = summary(model)$r.squared
)
# Display regression results for each country
print(regression_results)
ggplot(data = regression_results, aes(x = country, y = intercept)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
high_intercept_countries <- subset(regression_results, intercept > 0.75)
# View the countries with intercept values over 0.75
print(high_intercept_countries)
# Create scatterplots with regression lines for countries with intercept > 0.75 and smaller points
plots <- lapply(1:nrow(regression_results), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
if (intercept > 0.75) {
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point(size = 0.5) + # Set the size to a smaller value (e.g., 2)
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2)))
print(p)
}
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots for countries with intercept > 0.75 and smaller points are saved in the 'individual_country_plots' directory.")
[1] "Individual plots for countries with intercept > 0.75 and smaller points are saved in the 'individual_country_plots' directory."
regression_results <- data.frame(
country = c("Algeria", "Argentina", "Austria"),
intercept = c(0.92053422, 0.51698822, 0.42606684),
slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02 )
)
# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
slope <- regression_results$slope[i]
r_squared <- regression_results$r_squared[i]
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2),
"Slope:", round(slope, 2),
"R-squared:", round(r_squared, 2)))
print(p)
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots are saved in the 'individual_country_plots' directory.")
[1] "Individual plots are saved in the 'individual_country_plots' directory."
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data) # Regression over all countries
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.2) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5, linetype = "solid") + # Adjust line size and type
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)
# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.5) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2],
color = "red", size = 1) + # Add the overall regression line in red
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key